from typing import List
from pydantic import BaseModel, Field



from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core import  GPTVectorStoreIndex,VectorStoreIndex
from llama_index.llms import openai_like
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding  # HuggingFaceEmbedding:用于将文本转换为词向量
from llama_index.llms.huggingface import HuggingFaceLLM  # HuggingFaceLLM：用于运行Hugging Face的预训练语言模型
from llama_index.core import Settings,SimpleDirectoryReader,VectorStoreIndex
import chromadb
from llama_index.embeddings.dashscope import DashScopeEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.llms.deepseek  import DeepSeek
from llama_index.embeddings.fastembed import FastEmbedEmbedding

from llama_index.core import QueryBundle

# import NodeWithScore
from llama_index.core.schema import NodeWithScore

# Retrievers
from llama_index.core.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
    KeywordTableSimpleRetriever,
)
    # 连接Chroma数据库


llm = DeepSeek(model="deepseek-chat", api_key="sk-605e60a1301040759a821b6b677556fb")
Settings.llm = llm
 
from zhipuai import ZhipuAI
from llama_index.embeddings.zhipuai import ZhipuAIEmbedding

embeddings = ZhipuAIEmbedding(
    model="embedding-2",
    api_key="f387f5e4837d4e4bba6d267682a957c9.PmPiTw8qVlsI2Oi5"
    # With the `embedding-3` class
    # of models, you can specify the size
    # of the embeddings you want returned.
    # dimensions=1024
)
Settings.embed_model=embeddings

 
from llama_index.core.node_parser import SentenceSplitter

entity_extractor = EntityExtractor(
    prediction_threshold=0.5,
    label_entities=False,  # include the entity label in the metadata (can be erroneous)
    device="cpu",  # set to "cuda" if you have a GPU
)

node_parser = SentenceSplitter()

transformations = [node_parser, entity_extractor]

from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader(
    input_files=["./IPCC_AR6_WGII_Chapter03.pdf"]
).load_data()

from llama_index.core.ingestion import IngestionPipeline

import random

random.seed(42)
# comment out to run on all documents
# 100 documents takes about 5 minutes on CPU
documents = random.sample(documents, 100)

pipeline = IngestionPipeline(transformations=transformations)

nodes = pipeline.run(documents=documents)
samples = random.sample(nodes, 5)
for node in samples:
    print(node.metadata)

index = VectorStoreIndex(nodes=nodes)

query_engine = index.as_query_engine()
response = query_engine.query("What is said by Fox-Kemper?")
print(response)
for node in nodes:
    node.metadata.pop("entities", None)

print(nodes[0].metadata)

index = VectorStoreIndex(nodes=nodes)

query_engine = index.as_query_engine()
response = query_engine.query("What is said by Fox-Kemper?")
print(response)